Library Imports

from pyspark.sql import SparkSession
from pyspark.sql import types as T

from pyspark.sql import functions as F

from datetime import datetime
from decimal import Decimal

Template

spark = (
    SparkSession.builder
    .master("local")
    .appName("Section 1.3 - Maps and Dictionaries")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

sc = spark.sparkContext

def get_csv_schema(*args):
    return T.StructType([
        T.StructField(*arg)
        for arg in args
    ])

def read_csv(fname, schema):
    return spark.read.csv(
        path=fname,
        header=True,
        schema=get_csv_schema(*schema)
    )
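For illustration, here is how these helpers might be called; the schema tuples below are hypothetical (name, type, nullable) triples matching T.StructField's signature, and are not part of the original template.

# Hypothetical schema for the pets dataset; illustrative only.
pets_schema = [
    ('id',       T.StringType(), True),
    ('nickname', T.StringType(), True),
    ('age',      T.StringType(), True),
]
# pets = read_csv(path, pets_schema)  # `path` is built in the next cell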

import os

data_path = "/data/pets.csv"
base_path = os.path.dirname(os.getcwd())
path = base_path + data_path
pets = spark.read.csv(path, header=True)
pets.show()
+---+--------+--------+-------------------+---+-----+------+
| id|breed_id|nickname|           birthday|age|color|weight|
+---+--------+--------+-------------------+---+-----+------+
|  1|       1|    King|2014-11-22 12:30:31|  5|brown|  10.0|
|  2|       3|   Argus|2016-11-22 10:05:10| 10| null|   5.5|
|  3|       1|  Chewie|2016-11-22 10:05:10| 15| null|    12|
|  3|       2|   Maple|2018-11-22 10:05:10| 17|white|   3.4|
|  4|       2|    null|2019-01-01 10:05:10| 13| null|    10|
+---+--------+--------+-------------------+---+-----+------+

Maps and Dictionaries

Case 1: Creating a Mapping from Existing Columns

(
    pets
    .fillna({
        'nickname': 'Unknown Name',
        'age':      'Unknown Age',
    })
    .withColumn('{nickname:age}', F.create_map(F.col('nickname'), F.col('age')))
    .withColumn('{nickname:age} 2', F.create_map('nickname', 'age'))
    .show()
)
+---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+
| id|breed_id|    nickname|           birthday|age|color|weight|      {nickname:age}|    {nickname:age} 2|
+---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+
|  1|       1|        King|2014-11-22 12:30:31|  5|brown|  10.0|         [King -> 5]|         [King -> 5]|
|  2|       3|       Argus|2016-11-22 10:05:10| 10| null|   5.5|       [Argus -> 10]|       [Argus -> 10]|
|  3|       1|      Chewie|2016-11-22 10:05:10| 15| null|    12|      [Chewie -> 15]|      [Chewie -> 15]|
|  3|       2|       Maple|2018-11-22 10:05:10| 17|white|   3.4|       [Maple -> 17]|       [Maple -> 17]|
|  4|       2|Unknown Name|2019-01-01 10:05:10| 13| null|    10|[Unknown Name -> 13]|[Unknown Name -> 13]|
+---+--------+------------+-------------------+---+-----+------+--------------------+--------------------+

What Happened?

You can create a map type column using either column expressions (we'll learn what column expressions are later) or plain column names.
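As a small follow-up sketch (an addition, not part of the original example), you can read a value back out of a map column by key with getItem(); rows whose map does not contain the key return null:

(
    pets
    .withColumn('{nickname:age}', F.create_map('nickname', 'age'))
    # getItem('King') finds the key only on King's row; other rows get null
    .withColumn('kings_age', F.col('{nickname:age}').getItem('King'))
    .show()
)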

Case 2: Creating a Mapping from Constant Values

(
    pets
    .fillna({
        'nickname': 'Unknown Name',
        'age':      'Unknown Age',
    })
    .withColumn('{nickname:age}', F.create_map(F.lit('key'), F.lit('value')))
    .show()
)
+---+--------+------------+-------------------+---+-----+------+--------------+
| id|breed_id|    nickname|           birthday|age|color|weight|{nickname:age}|
+---+--------+------------+-------------------+---+-----+------+--------------+
|  1|       1|        King|2014-11-22 12:30:31|  5|brown|  10.0|[key -> value]|
|  2|       3|       Argus|2016-11-22 10:05:10| 10| null|   5.5|[key -> value]|
|  3|       1|      Chewie|2016-11-22 10:05:10| 15| null|    12|[key -> value]|
|  3|       2|       Maple|2018-11-22 10:05:10| 17|white|   3.4|[key -> value]|
|  4|       2|Unknown Name|2019-01-01 10:05:10| 13| null|    10|[key -> value]|
+---+--------+------------+-------------------+---+-----+------+--------------+

What Happened?

You can create a map column of literal values using the column expression F.lit(); we will learn more about this later on. Notice that each key and value must be a column expression; this will be a common theme throughout Spark.
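Building on this, here is a minimal sketch (added for illustration, not from the original example) that mixes literal keys with column values; F.create_map accepts any even number of key/value expressions, paired up in order:

(
    pets
    # literal strings as keys, existing columns as values; all map values
    # must share a type (both are strings here, since no schema was given)
    .withColumn('attributes', F.create_map(
        F.lit('age'),    F.col('age'),
        F.lit('weight'), F.col('weight'),
    ))
    .show()
)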

Summary

  • It is very simple to create map data in Spark.
  • You can do so with either existing columns or constant values.
  • If constant values are used, each key and value must be wrapped in a column expression such as F.lit() (see the sketch below).
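A minimal sketch of that last point (added for illustration): bare strings passed to F.create_map are resolved as column names, so constants must be wrapped in F.lit() to be treated as literals.

(
    pets
    .withColumn('literal_map', F.create_map(F.lit('key'), F.lit('value')))
    # .withColumn('broken', F.create_map('key', 'value'))  # would fail:
    # 'key' and 'value' are resolved as (nonexistent) column names
    .select('literal_map')
    .show(5)
)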
